!pip install --upgrade pip
!pip install palmerpenguins==0.1.4 numpy==1.23.4 pandas==1.5.1 seaborn==0.12.1 matplotlib==3.6.0 empiricaldist==0.6.7 statsmodels==0.13.5 scikit-learn==1.1.2 pyjanitor==0.23.1 session-info
import empiricaldist
import janitor
import matplotlib.pyplot as plt
import numpy as np
import palmerpenguins
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as ss
import session_info
Matplotlib is building the font cache; this may take a moment.
%matplotlib inline
# Global plot styling: seaborn style and context plus the default figure size.
sns.set_style('whitegrid')
sns.set_context('notebook')
plt.rcParams.update({'figure.figsize': (11, 9.4)})

# Colour (RGBA hex) assigned to each penguin species; used as `palette=` in plots.
penguin_color = dict(
    Adelie='#ff6602ff',
    Gentoo='#0f7175ff',
    Chinstrap='#c65dc9ff',
)
# palmerpenguins
# Load the raw (un-cleaned) dataset shipped with the palmerpenguins package.
raw_penguins_df = palmerpenguins.load_penguins_raw()
raw_penguins_df
| studyName | Sample Number | Species | Region | Island | Stage | Individual ID | Clutch Completion | Date Egg | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | Comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PAL0708 | 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A1 | Yes | 2007-11-11 | 39.1 | 18.7 | 181.0 | 3750.0 | MALE | NaN | NaN | Not enough blood for isotopes. |
| 1 | PAL0708 | 2 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A2 | Yes | 2007-11-11 | 39.5 | 17.4 | 186.0 | 3800.0 | FEMALE | 8.94956 | -24.69454 | NaN |
| 2 | PAL0708 | 3 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A1 | Yes | 2007-11-16 | 40.3 | 18.0 | 195.0 | 3250.0 | FEMALE | 8.36821 | -25.33302 | NaN |
| 3 | PAL0708 | 4 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A2 | Yes | 2007-11-16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Adult not sampled. |
| 4 | PAL0708 | 5 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N3A1 | Yes | 2007-11-16 | 36.7 | 19.3 | 193.0 | 3450.0 | FEMALE | 8.76651 | -25.32426 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | PAL0910 | 64 | Chinstrap penguin (Pygoscelis antarctica) | Anvers | Dream | Adult, 1 Egg Stage | N98A2 | Yes | 2009-11-19 | 55.8 | 19.8 | 207.0 | 4000.0 | MALE | 9.70465 | -24.53494 | NaN |
| 340 | PAL0910 | 65 | Chinstrap penguin (Pygoscelis antarctica) | Anvers | Dream | Adult, 1 Egg Stage | N99A1 | No | 2009-11-21 | 43.5 | 18.1 | 202.0 | 3400.0 | FEMALE | 9.37608 | -24.40753 | Nest never observed with full clutch. |
| 341 | PAL0910 | 66 | Chinstrap penguin (Pygoscelis antarctica) | Anvers | Dream | Adult, 1 Egg Stage | N99A2 | No | 2009-11-21 | 49.6 | 18.2 | 193.0 | 3775.0 | MALE | 9.46180 | -24.70615 | Nest never observed with full clutch. |
| 342 | PAL0910 | 67 | Chinstrap penguin (Pygoscelis antarctica) | Anvers | Dream | Adult, 1 Egg Stage | N100A1 | Yes | 2009-11-21 | 50.8 | 19.0 | 210.0 | 4100.0 | MALE | 9.98044 | -24.68741 | NaN |
| 343 | PAL0910 | 68 | Chinstrap penguin (Pygoscelis antarctica) | Anvers | Dream | Adult, 1 Egg Stage | N100A2 | Yes | 2009-11-21 | 50.2 | 18.7 | 198.0 | 3775.0 | FEMALE | 9.39305 | -24.25255 | NaN |
344 rows × 17 columns
preprocess_penguins_df = palmerpenguins.load_penguins()
preprocess_penguins_df
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Chinstrap | Dream | 55.8 | 19.8 | 207.0 | 4000.0 | male | 2009 |
| 340 | Chinstrap | Dream | 43.5 | 18.1 | 202.0 | 3400.0 | female | 2009 |
| 341 | Chinstrap | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | male | 2009 |
| 342 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | male | 2009 |
| 343 | Chinstrap | Dream | 50.2 | 18.7 | 198.0 | 3775.0 | female | 2009 |
344 rows × 8 columns
# seaborn
# Same dataset as bundled with seaborn (displayed only, not stored).
sns.load_dataset("penguins")
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
| 340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
| 341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
| 342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
| 343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
Deepnote
Links de importación de datos:
preprocess_penguins_df = pd.read_csv("penguins.csv")
preprocess_penguins_df
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Chinstrap | Dream | 55.8 | 19.8 | 207.0 | 4000.0 | male | 2009 |
| 340 | Chinstrap | Dream | 43.5 | 18.1 | 202.0 | 3400.0 | female | 2009 |
| 341 | Chinstrap | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | male | 2009 |
| 342 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | male | 2009 |
| 343 | Chinstrap | Dream | 50.2 | 18.7 | 198.0 | 3775.0 | female | 2009 |
344 rows × 8 columns
preprocess_penguins_df.dtypes
species object island object bill_length_mm float64 bill_depth_mm float64 flipper_length_mm float64 body_mass_g float64 sex object year int64 dtype: object
(
preprocess_penguins_df
.dtypes
.value_counts()
)
float64 4 object 3 int64 1 dtype: int64
preprocess_penguins_df.shape
(344, 8)
(
preprocess_penguins_df
.isnull()
.any()
)
species False island False bill_length_mm True bill_depth_mm True flipper_length_mm True body_mass_g True sex True year False dtype: bool
(
preprocess_penguins_df
.isnull()
.sum()
)
species 0 island 0 bill_length_mm 2 bill_depth_mm 2 flipper_length_mm 2 body_mass_g 2 sex 11 year 0 dtype: int64
(
preprocess_penguins_df
.isnull()
.sum()
.sum()
)
19
# Proportion of missing (True) vs. present (False) values for every column,
# drawn as filled horizontal bars.
sns.displot(
    data=preprocess_penguins_df.isnull().melt(),
    y="variable",
    hue="value",
    multiple="fill",
    aspect=2,
)
<seaborn.axisgrid.FacetGrid at 0x7fbe102d1090>
# Heatmap of the null mask, transposed so each dataframe column becomes a row.
sns.heatmap(data=preprocess_penguins_df.isnull().transpose())
<AxesSubplot: >
processed_penguins_df = (
preprocess_penguins_df.dropna()
)
processed_penguins_df
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
| 5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | male | 2007 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Chinstrap | Dream | 55.8 | 19.8 | 207.0 | 4000.0 | male | 2009 |
| 340 | Chinstrap | Dream | 43.5 | 18.1 | 202.0 | 3400.0 | female | 2009 |
| 341 | Chinstrap | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | male | 2009 |
| 342 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | male | 2009 |
| 343 | Chinstrap | Dream | 50.2 | 18.7 | 198.0 | 3775.0 | female | 2009 |
333 rows × 8 columns
# Summary statistics for every column.
processed_penguins_df.describe(include='all')
# include='all' summarises both numeric and categorical variables
# include=[np.number] (or plain describe()) summarises numeric variables only
# include=object summarises categorical (object dtype) variables only
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| count | 333 | 333 | 333.000000 | 333.000000 | 333.000000 | 333.000000 | 333 | 333.000000 |
| unique | 3 | 3 | NaN | NaN | NaN | NaN | 2 | NaN |
| top | Adelie | Biscoe | NaN | NaN | NaN | NaN | male | NaN |
| freq | 146 | 163 | NaN | NaN | NaN | NaN | 168 | NaN |
| mean | NaN | NaN | 43.992793 | 17.164865 | 200.966967 | 4207.057057 | NaN | 2008.042042 |
| std | NaN | NaN | 5.468668 | 1.969235 | 14.015765 | 805.215802 | NaN | 0.812944 |
| min | NaN | NaN | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | NaN | 2007.000000 |
| 25% | NaN | NaN | 39.500000 | 15.600000 | 190.000000 | 3550.000000 | NaN | 2007.000000 |
| 50% | NaN | NaN | 44.500000 | 17.300000 | 197.000000 | 4050.000000 | NaN | 2008.000000 |
| 75% | NaN | NaN | 48.600000 | 18.700000 | 213.000000 | 4775.000000 | NaN | 2009.000000 |
| max | NaN | NaN | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | NaN | 2009.000000 |
# Summary statistics restricted to the numeric columns.
processed_penguins_df.describe(include=[np.number])
# include='all' summarises both numeric and categorical variables
# include=[np.number] (or plain describe()) summarises numeric variables only
# include=object summarises categorical (object dtype) variables only
| bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | year | |
|---|---|---|---|---|---|
| count | 333.000000 | 333.000000 | 333.000000 | 333.000000 | 333.000000 |
| mean | 43.992793 | 17.164865 | 200.966967 | 4207.057057 | 2008.042042 |
| std | 5.468668 | 1.969235 | 14.015765 | 805.215802 | 0.812944 |
| min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | 2007.000000 |
| 25% | 39.500000 | 15.600000 | 190.000000 | 3550.000000 | 2007.000000 |
| 50% | 44.500000 | 17.300000 | 197.000000 | 4050.000000 | 2008.000000 |
| 75% | 48.600000 | 18.700000 | 213.000000 | 4775.000000 | 2009.000000 |
| max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | 2009.000000 |
processed_penguins_df.describe(include=object)
| species | island | sex | |
|---|---|---|---|
| count | 333 | 333 | 333 |
| unique | 3 | 3 | 2 |
| top | Adelie | Biscoe | male |
| freq | 146 | 163 | 168 |
# Cast the text columns to the pandas `category` dtype.
# NOTE: the result is only displayed; it is not assigned back, so
# processed_penguins_df keeps its original object dtypes.
(
processed_penguins_df
.astype({
'species': 'category',
'island': 'category',
'sex': 'category'
})
)
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
| 5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | male | 2007 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Chinstrap | Dream | 55.8 | 19.8 | 207.0 | 4000.0 | male | 2009 |
| 340 | Chinstrap | Dream | 43.5 | 18.1 | 202.0 | 3400.0 | female | 2009 |
| 341 | Chinstrap | Dream | 49.6 | 18.2 | 193.0 | 3775.0 | male | 2009 |
| 342 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | male | 2009 |
| 343 | Chinstrap | Dream | 50.2 | 18.7 | 198.0 | 3775.0 | female | 2009 |
333 rows × 8 columns
# Bar chart of the number of penguins per species using the pandas plotting API.
(
processed_penguins_df
.species
.value_counts() # count the rows for each species
.plot(
kind='bar' # draw the counts as bars
)
)
<AxesSubplot: >
sns.catplot(
data=processed_penguins_df,
x='species',
kind='count',
palette=penguin_color
)
<seaborn.axisgrid.FacetGrid at 0x7fbe102d3cd0>
(
processed_penguins_df
.value_counts('species', sort=True)
.reset_index(name='count')
.pipe(
lambda df: (
sns.barplot(
data=df,
x='species',
y='count',
palette=penguin_color
)
)
)
)
<AxesSubplot: xlabel='species', ylabel='count'>
# Single stacked bar showing the proportion of each species.
(
processed_penguins_df
.add_column('x','') # pyjanitor: add a constant empty-string column so all rows share one x position
.pipe(
lambda df:(
sns.displot(
data=df,
x='x',
hue='species',
multiple='fill', # normalise the stack so it shows proportions
palette=penguin_color
)
)
)
)
<seaborn.axisgrid.FacetGrid at 0x7fbe0dfd58d0>
processed_penguins_df.bill_depth_mm.mean()
17.164864864864867
np.mean(processed_penguins_df.bill_depth_mm)
17.164864864864867
# Mean of every numeric column; numeric_only is explicit to avoid the
# pandas FutureWarning about its changing default.
processed_penguins_df.mean(numeric_only=True)
/tmp/ipykernel_3366/1618060137.py:1: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.mean()
bill_length_mm 43.992793 bill_depth_mm 17.164865 flipper_length_mm 200.966967 body_mass_g 4207.057057 year 2008.042042 dtype: float64
# Median of every numeric column; numeric_only is explicit to avoid the
# pandas FutureWarning about its changing default.
processed_penguins_df.median(numeric_only=True)
/tmp/ipykernel_3366/3242987746.py:1: FutureWarning: The default value of numeric_only in DataFrame.median is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.median()
bill_length_mm 44.5 bill_depth_mm 17.3 flipper_length_mm 197.0 body_mass_g 4050.0 year 2008.0 dtype: float64
processed_penguins_df.mode()
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Biscoe | 41.1 | 17.0 | 190.0 | 3800.0 | male | 2009 |
processed_penguins_df.describe(include="object")
| species | island | sex | |
|---|---|---|---|
| count | 333 | 333 | 333 |
| unique | 3 | 3 | 2 |
| top | Adelie | Biscoe | male |
| freq | 146 | 163 | 168 |
# Maximum of every numeric column (numeric_only takes a bool, not the string "True").
processed_penguins_df.max(numeric_only=True)
bill_length_mm 59.6 bill_depth_mm 21.5 flipper_length_mm 231.0 body_mass_g 6300.0 year 2009.0 dtype: float64
# Minimum of every numeric column (numeric_only takes a bool, not the string "True").
processed_penguins_df.min(numeric_only=True)
bill_length_mm 32.1 bill_depth_mm 13.1 flipper_length_mm 172.0 body_mass_g 2700.0 year 2007.0 dtype: float64
# Range (max - min) of every numeric column.
processed_penguins_df.max(numeric_only=True) - processed_penguins_df.min(numeric_only=True)
bill_length_mm 27.5 bill_depth_mm 8.4 flipper_length_mm 59.0 body_mass_g 3600.0 year 2.0 dtype: float64
# Sample standard deviation of every numeric column; numeric_only is explicit
# to avoid the pandas FutureWarning about its changing default.
processed_penguins_df.std(numeric_only=True)
/tmp/ipykernel_3366/4261057176.py:1: FutureWarning: The default value of numeric_only in DataFrame.std is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.std()
bill_length_mm 5.468668 bill_depth_mm 1.969235 flipper_length_mm 14.015765 body_mass_g 805.215802 year 0.812944 dtype: float64
# Upper bound of the one-standard-deviation band: mean + std per numeric column.
processed_penguins_df.mean(numeric_only=True) + processed_penguins_df.std(numeric_only=True)
/tmp/ipykernel_3366/653160575.py:1: FutureWarning: The default value of numeric_only in DataFrame.std is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.std() + processed_penguins_df.mean() /tmp/ipykernel_3366/653160575.py:1: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.std() + processed_penguins_df.mean()
bill_length_mm 49.461461 bill_depth_mm 19.134100 flipper_length_mm 214.982732 body_mass_g 5012.272859 year 2008.854986 dtype: float64
# Lower bound of the one-standard-deviation band: mean - std per numeric column.
# (The operands were previously reversed, which produced negative values.)
processed_penguins_df.mean(numeric_only=True) - processed_penguins_df.std(numeric_only=True)
/tmp/ipykernel_3366/3630115265.py:1: FutureWarning: The default value of numeric_only in DataFrame.std is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.std() - processed_penguins_df.mean() /tmp/ipykernel_3366/3630115265.py:1: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.std() - processed_penguins_df.mean()
bill_length_mm -38.524124 bill_depth_mm -15.195629 flipper_length_mm -186.951202 body_mass_g -3401.841255 year -2007.229098 dtype: float64
#sns.scatterplot(
# x=x,
# y=y
#)
#fx_1 = np.array([x.min(), x.max()])
#fy_1 = res_x_y.intercept + res_x_y.slope * fx_1
#plt.plot(fx_1, fy_1)
#sns.scatterplot(
# x=y,
# y=x
#)
#fx_2 = np.array([y.min(), y.max()])
#fy_2 = res_y_x.intercept + res_y_x.slope * fx_2
#plt.plot(fx_2, fy_2)
#sns.scatterplot(
# x=x,
# y=y
#)
#plt.plot(fx_1, fy_1)
#plt.plot(fy_2, fx_2)
# Third quartile (Q3) of every numeric column.
processed_penguins_df.quantile(0.75, numeric_only=True)
/tmp/ipykernel_3366/221191510.py:1: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.quantile(0.75)
bill_length_mm 48.6 bill_depth_mm 18.7 flipper_length_mm 213.0 body_mass_g 4775.0 year 2009.0 Name: 0.75, dtype: float64
# Interquartile range (Q3 - Q1) of every numeric column.
(
    processed_penguins_df.quantile(0.75, numeric_only=True)
    - processed_penguins_df.quantile(0.25, numeric_only=True)
)
/tmp/ipykernel_3366/1337602098.py:1: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.quantile(0.75) - processed_penguins_df.quantile(0.25)
bill_length_mm 9.1 bill_depth_mm 3.1 flipper_length_mm 23.0 body_mass_g 1225.0 year 2.0 dtype: float64
# Table with Q3, median, Q1 and the IQR for each numeric variable.
(
    processed_penguins_df
    .quantile(q=[0.75, 0.5, 0.25], numeric_only=True)  # explicit to avoid the FutureWarning
    .transpose()
    .rename_axis("variable")
    .reset_index()
    .assign(
        iqr=lambda df: df[0.75] - df[0.25]
    )
)
/tmp/ipykernel_3366/2807587524.py:2: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df
| variable | 0.75 | 0.5 | 0.25 | iqr | |
|---|---|---|---|---|---|
| 0 | bill_length_mm | 48.6 | 44.5 | 39.5 | 9.1 |
| 1 | bill_depth_mm | 18.7 | 17.3 | 15.6 | 3.1 |
| 2 | flipper_length_mm | 213.0 | 197.0 | 190.0 | 23.0 |
| 3 | body_mass_g | 4775.0 | 4050.0 | 3550.0 | 1225.0 |
| 4 | year | 2009.0 | 2008.0 | 2007.0 | 2.0 |
sns.histplot(
data=processed_penguins_df,
x='flipper_length_mm'
)
plt.axvline(
x=processed_penguins_df.flipper_length_mm.mean(),
color='red',
linestyle='dashed',
linewidth=2
)
<matplotlib.lines.Line2D at 0x7fbe5654da20>
sns.boxplot(
data=processed_penguins_df,
x='flipper_length_mm'
)
<AxesSubplot: xlabel='flipper_length_mm'>
def freedman_diaconis_bindwidth(x: pd.Series) -> float:
    """Return the histogram bin width given by the Freedman-Diaconis rule.

    The width is ``2 * IQR / n ** (1 / 3)``, where ``IQR`` is the
    interquartile range of ``x`` and ``n`` is the number of non-missing
    observations.

    Args:
        x: Numeric sample. Missing values are ignored.

    Returns:
        The bin width, or ``nan`` when ``x`` has no non-missing values.
    """
    # Series.quantile skips NaN, so count only non-missing values to keep
    # the IQR and n computed over the same observations.
    n_obs = x.count()
    if n_obs == 0:
        return float("nan")
    iqr = x.quantile(0.75) - x.quantile(0.25)
    return 2 * iqr / n_obs ** (1 / 3)
sns.histplot(
data=processed_penguins_df,
x='flipper_length_mm',
binwidth=2
)
plt.axvline(
x=processed_penguins_df.flipper_length_mm.mean(),
color='red',
linestyle='dashed',
linewidth=2
)
<matplotlib.lines.Line2D at 0x7fbe0db764d0>
# seaborn
# Histogram with 1 mm bins, normalised so bar heights are probabilities.
sns.histplot(
    data=processed_penguins_df,
    x='flipper_length_mm',
    binwidth=1,
    stat='probability'
)
<AxesSubplot: xlabel='flipper_length_mm', ylabel='Probability'>
# empiricaldist
# Empirical PMF of flipper length; normalize=True makes the probabilities sum to 1.
pmf_flipper_length_mm = empiricaldist.Pmf.from_seq(
    processed_penguins_df.flipper_length_mm,
    normalize=True
)
pmf_flipper_length_mm
| probs | |
|---|---|
| 172.0 | 0.003003 |
| 174.0 | 0.003003 |
| 176.0 | 0.003003 |
| 178.0 | 0.012012 |
| 180.0 | 0.012012 |
| 181.0 | 0.021021 |
| 182.0 | 0.009009 |
| 183.0 | 0.006006 |
| 184.0 | 0.021021 |
| 185.0 | 0.027027 |
| 186.0 | 0.018018 |
| 187.0 | 0.048048 |
| 188.0 | 0.018018 |
| 189.0 | 0.021021 |
| 190.0 | 0.063063 |
| 191.0 | 0.039039 |
| 192.0 | 0.021021 |
| 193.0 | 0.042042 |
| 194.0 | 0.015015 |
| 195.0 | 0.051051 |
| 196.0 | 0.030030 |
| 197.0 | 0.030030 |
| 198.0 | 0.024024 |
| 199.0 | 0.018018 |
| 200.0 | 0.012012 |
| 201.0 | 0.018018 |
| 202.0 | 0.012012 |
| 203.0 | 0.015015 |
| 205.0 | 0.009009 |
| 206.0 | 0.003003 |
| 207.0 | 0.006006 |
| 208.0 | 0.024024 |
| 209.0 | 0.015015 |
| 210.0 | 0.042042 |
| 211.0 | 0.006006 |
| 212.0 | 0.021021 |
| 213.0 | 0.018018 |
| 214.0 | 0.015015 |
| 215.0 | 0.036036 |
| 216.0 | 0.018018 |
| 217.0 | 0.015015 |
| 218.0 | 0.015015 |
| 219.0 | 0.015015 |
| 220.0 | 0.024024 |
| 221.0 | 0.015015 |
| 222.0 | 0.018018 |
| 223.0 | 0.006006 |
| 224.0 | 0.009009 |
| 225.0 | 0.012012 |
| 226.0 | 0.003003 |
| 228.0 | 0.012012 |
| 229.0 | 0.006006 |
| 230.0 | 0.021021 |
| 231.0 | 0.003003 |
pmf_flipper_length_mm.bar()
pmf_flipper_length_mm(190)
0.06306306306306306
# Largest observed flipper length.
processed_penguins_df.flipper_length_mm.max()
# empiricaldist did not collapse (bin) the values the way seaborn does
231.0
# seaborn
# Empirical cumulative distribution of flipper length.
sns.ecdfplot(
    data=processed_penguins_df,
    x='flipper_length_mm'
)
# empiricaldist
# Empirical CDF of flipper length built from the observed values.
cdf_flipper_length_mm = empiricaldist.Cdf.from_seq(
    processed_penguins_df.flipper_length_mm,
    normalize=True
)
# Plot the empirical CDF and mark the point (q, p) on the curve.
cdf_flipper_length_mm.plot()
q = 200
p = cdf_flipper_length_mm.forward(q)
# Dashed guide lines from each axis to the point (q, p).
plt.vlines (
x = q,
ymin = 0,
ymax=p,
color='black',
linestyle='dashed'
)
plt.hlines(
y=p,
xmin=pmf_flipper_length_mm.qs[0],
xmax=q,
color='black',
linestyle='dashed'
)
plt.plot(q,p,'ro')
# Report the pair that was marked.
print(q,p)
# p is the CDF evaluated at q: the cumulative probability of a flipper length
# up to q (not the probability of a length exactly equal to q).
200 0.5675675675675675
# Step plot of the empirical CDF with the two quartiles that bound the IQR marked.
cdf_flipper_length_mm.step()

# Probabilities must lie in [0, 1]; the previous 8.25 / 8.75 values were
# invalid and unused, so the quartile probabilities are defined here and used.
p_1 = 0.25  # lower quartile
p_2 = 0.75  # upper quartile
ps = (p_1, p_2)  # IQR
qs = cdf_flipper_length_mm.inverse(ps)  # quantities at those probabilities

# Dashed guide lines from each axis to the marked points.
plt.vlines(
    x=qs,
    ymin=0,
    ymax=ps,
    color='black',
    linestyle='dashed'
)
plt.hlines(
    y=ps,
    xmin=pmf_flipper_length_mm.qs[0],
    xmax=qs,
    color='black',
    linestyle='dashed'
)
# zorder=2 draws the markers above the guide lines.
plt.scatter(
    x=qs,
    y=ps,
    color='red',
    zorder=2
)
<matplotlib.collections.PathCollection at 0x7fbe0d4e78e0>
sns.ecdfplot(
data=processed_penguins_df,
x='flipper_length_mm',
hue='species',
palette=penguin_color
)
<AxesSubplot: xlabel='flipper_length_mm', ylabel='Proportion'>
sns.kdeplot(
data=processed_penguins_df,
x='flipper_length_mm',
bw_method=0.1
)
<AxesSubplot: xlabel='flipper_length_mm', ylabel='Density'>
stats = processed_penguins_df.body_mass_g.describe()
stats
count 333.000000 mean 4207.057057 std 805.215802 min 2700.000000 25% 3550.000000 50% 4050.000000 75% 4775.000000 max 6300.000000 Name: body_mass_g, dtype: float64
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).cdf(xs)
plt.plot(xs, ys, color='black', linestyle='dashed')
empiricaldist.Cdf.from_seq(
processed_penguins_df.body_mass_g,
normalize=True
).plot()
<AxesSubplot: >
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).pdf(xs)
plt.plot(xs, ys, color='black', linestyle='dashed')
sns.kdeplot(
data=processed_penguins_df,
x='body_mass_g'
)
<AxesSubplot: xlabel='body_mass_g', ylabel='Density'>
dice = empiricaldist.Pmf.from_seq([1,2,3,4,5,6])
dice.bar()
# Draw samples of increasing size from the fair die and compare each sample's
# PMF with the theoretical probability 1/6 (red dashed line).
for sample_size in (100, 1_000, 10_000):
    values = dice.sample(sample_size)  # draw `sample_size` values from the die
    sample_pmf = empiricaldist.Pmf.from_seq(values)  # probability of each face in the sample

    # One figure per sample size.
    plt.figure(figsize=(5, 5))
    sample_pmf.bar()
    plt.axhline(y=1/6, color='red', linestyle='dashed')
    plt.ylim([0, 0.50])
    plt.title(f'Sample size: {sample_size}')
processed_penguins_df.sex.value_counts(normalize=True) # proportion of each sex
# binomial distribution: the categories must be encoded as numbers
male 0.504505 female 0.495495 Name: sex, dtype: float64
# Encode sex as an integer: male -> 1, female -> 0.
sex_numeric = processed_penguins_df.sex.replace(['male', 'female'], [1, 0])
sex_numeric
# the values are now numeric
0 1
1 0
2 0
4 0
5 1
..
339 1
340 0
341 1
342 1
343 0
Name: sex, Length: 333, dtype: int64
# Hide pandas' "DataFrame is highly fragmented" PerformanceWarning.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
number_sample = 1000  # number of samples to draw
sample_size = 35  # observations per sample

np.random.seed(42)

# Draw all samples (with replacement) and build the frame in a single call.
# Inserting 1000 columns one at a time fragments the DataFrame and triggers
# pandas' PerformanceWarning. The draws happen in the same order as before,
# so the seeded results are unchanged.
samples_df = pd.DataFrame({
    f'sample {i}': sex_numeric.sample(sample_size, replace=True).to_numpy()
    for i in range(1, number_sample + 1)
})

# Mean of the per-sample means.
male_population_mean = samples_df.mean().mean()
print(f'El porcentaje de pinguinos machos en la poblacion es de: {male_population_mean*100:.4f}%')
El porcentaje de pinguinos machos en la poblacion es de: 50.1829%
sample_means_binomial =pd.DataFrame(samples_df.mean(), columns=['sample_mean'])
sns.kdeplot(data=sample_means_binomial)
plt.axvline(x=sex_numeric.mean(), color='red', linestyle='dashed')
<matplotlib.lines.Line2D at 0x7fbe0d28cbb0>
#sample_size_experiment = pd.DataFrame(
# [[i,samples_df.iloc[:, 0:].mean().mean().mean()] for i in range(1, number_sample + 1)],
# columns=['sample_size', 'estimated_mean']
#)
#sns.scatterplot(
# data=sample_size_experiment,
# x='sample_size',
# y='estimated_mean'
#)
#plt.axhline(
# y=sex_numeric.mean(),
# color='red',
# linestyle='dashed'
#)
#plt.ylim([sex_numeric.mean() - 0.20, sex_numeric.mean() + 0.20])
sns.scatterplot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
alpha=1/2,
s=100
)
<AxesSubplot: xlabel='bill_length_mm', ylabel='bill_depth_mm'>
sns.displot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
rug=True
)
<seaborn.axisgrid.FacetGrid at 0x7fbe0d426830>
sns.displot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
rug=True,
kind='kde'
)
<seaborn.axisgrid.FacetGrid at 0x7fbe0d5370a0>
sns.jointplot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
)
<seaborn.axisgrid.JointGrid at 0x7fbe0d5370d0>
sns.scatterplot(
data=processed_penguins_df,
x='species',
y='flipper_length_mm',
hue='species',
palette=penguin_color
)
<AxesSubplot: xlabel='species', ylabel='flipper_length_mm'>
sns.stripplot(
data=processed_penguins_df,
x='species',
y='flipper_length_mm',
hue='species',
palette=penguin_color
)
<AxesSubplot: xlabel='species', ylabel='flipper_length_mm'>
ax = sns.boxplot(
data=processed_penguins_df,
x='flipper_length_mm',
y='species',
hue='species',
palette=penguin_color
)
ax = sns.stripplot(
data=processed_penguins_df,
x='flipper_length_mm',
y='species',
hue='species',
palette='dark:.3'
)
ax = sns.violinplot(
data=processed_penguins_df,
x='species',
y='flipper_length_mm',
color='.8'
)
ax = sns.stripplot(
data=processed_penguins_df,
x='species',
y='flipper_length_mm',
hue='species',
palette=penguin_color
)
sns.swarmplot(
data=processed_penguins_df,
x='species',
y='flipper_length_mm',
hue='species',
palette=penguin_color
)
<AxesSubplot: xlabel='species', ylabel='flipper_length_mm'>
# Pairwise correlation matrix of the numeric columns; numeric_only is
# explicit, matching the later heatmap cells and avoiding the FutureWarning.
processed_penguins_df.corr(numeric_only=True)
/tmp/ipykernel_3366/4090656914.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. processed_penguins_df.corr()
| bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | year | |
|---|---|---|---|---|---|
| bill_length_mm | 1.000000 | -0.228626 | 0.653096 | 0.589451 | 0.032657 |
| bill_depth_mm | -0.228626 | 1.000000 | -0.577792 | -0.472016 | -0.048182 |
| flipper_length_mm | 0.653096 | -0.577792 | 1.000000 | 0.872979 | 0.151068 |
| body_mass_g | 0.589451 | -0.472016 | 0.872979 | 1.000000 | 0.021862 |
| year | 0.032657 | -0.048182 | 0.151068 | 0.021862 | 1.000000 |
sns.heatmap(
data=processed_penguins_df.corr(numeric_only=True),
cmap=sns.diverging_palette(20,300, as_cmap=True),
center=0,
vmin=-1,
vmax=1,
linewidth=0.5,
annot=True
)
<AxesSubplot: >
sns.clustermap(
data=processed_penguins_df.corr(numeric_only=True),
cmap=sns.diverging_palette(20,300, as_cmap=True),
center=0,
vmin=-1,
vmax=1,
linewidth=0.5,
annot=True
)
<seaborn.matrix.ClusterGrid at 0x7fbe0914b940>
processed_penguins_df = (
processed_penguins_df
.assign(
numeric_sex=lambda df: df.sex.replace(['female', 'male'], [0,1])
)
)
sns.clustermap(
data=processed_penguins_df.corr(numeric_only=True),
cmap=sns.diverging_palette(20,300, as_cmap=True),
center=0,
vmin=-1,
vmax=1,
linewidth=0.5,
annot=True
)
<seaborn.matrix.ClusterGrid at 0x7fbe09849690>
x = np.linspace(-100,100,100)
y=x**2
y+=np.random.normal(0,1000,x.size)
sns.scatterplot(
x=x,
y=y
)
np.corrcoef(x,y)
array([[1. , 0.00269802],
[0.00269802, 1. ]])
x = np.linspace(-100,100,100)
y=x**3
y+=np.random.normal(0,1000,x.size)
sns.scatterplot(
x=x,
y=y
)
np.corrcoef(x,y)
array([[1. , 0.91665357],
[0.91665357, 1. ]])
# Scatter plot of bill depth against bill length for all penguins.
sns.scatterplot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm'
)
# Association? Correlation? If not, why?
<AxesSubplot: xlabel='bill_length_mm', ylabel='bill_depth_mm'>
np.random.seed(42)
x_1 = np.linspace(0,100,100)
y_1 = 0.1 * x_1 + 3 + np.random.uniform(-2,2,size=x_1.size)
sns.scatterplot(
x=x_1,
y=y_1
)
x_2 = np.linspace(0,100,100)
y_2 = 0.5 * x_2 + 3 + np.random.uniform(0,60,size=x_2.size)
sns.scatterplot(
x=x_2,
y=y_2
)
plt.legend(labels=['1','2'])
print(np.corrcoef(x_1,y_1))
print(np.corrcoef(x_2,y_2))
[[1. 0.92761617] [0.92761617 1. ]] [[1. 0.67476343] [0.67476343 1. ]]
# Fit a simple linear regression to each synthetic data set.
res_1 = scipy.stats.linregress(x=x_1,y=y_1)
res_2 = scipy.stats.linregress(x=x_2,y=y_2)
print(res_1,res_2,sep='\n') # author's note: larger slope = stronger effect
LinregressResult(slope=0.1008196928097962, intercept=2.8397383330230257, rvalue=0.9276161661149585, pvalue=1.0607043467839354e-43, stderr=0.004101050284084738, intercept_stderr=0.23737141027424585) LinregressResult(slope=0.5470008424819232, intercept=30.519861265205215, rvalue=0.6747634267657533, pvalue=1.3883699878991534e-14, stderr=0.06043657503136452, intercept_stderr=3.498107570885822)
# Data set 1 with its fitted regression line.
sns.scatterplot(
    x=x_1,
    y=y_1
)
# The line spans the range of x_1 (previously it started at the unrelated `x`).
fx_1 = np.array([x_1.min(), x_1.max()])
fy_1 = res_1.intercept + res_1.slope * fx_1
plt.plot(fx_1, fy_1)

# Data set 2 with its fitted regression line.
sns.scatterplot(
    x=x_2,
    y=y_2
)
fx_2 = np.array([x_2.min(), x_2.max()])
fy_2 = res_2.intercept + res_2.slope * fx_2
# Plot the fitted values fy_2 (previously fx_2 was plotted against itself).
plt.plot(fx_2, fy_2)

plt.legend(labels=['1','1','2','2'])
<matplotlib.legend.Legend at 0x7fbe08c35b40>
sns.scatterplot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm'
)
res_penguins = scipy.stats.linregress(
x=processed_penguins_df.bill_length_mm,
y=processed_penguins_df.bill_depth_mm
)
print(res_penguins)
fx_1 = np.array([processed_penguins_df.bill_length_mm.min(), processed_penguins_df.bill_length_mm.max()])
fy_1 = res_penguins.intercept + res_penguins.slope * fx_1
plt.plot(fx_1,fy_1)
LinregressResult(slope=-0.08232675339862278, intercept=20.78664866843383, rvalue=-0.2286256359130291, pvalue=2.5282897209443147e-05, stderr=0.01926834673577886, intercept_stderr=0.8541730787409803)
[<matplotlib.lines.Line2D at 0x7fbe08ce4fd0>]
sns.lmplot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
height=10
)
<seaborn.axisgrid.FacetGrid at 0x7fbe08cfb880>
y = processed_penguins_df.bill_depth_mm
x = processed_penguins_df.bill_length_mm
res_x_y = scipy.stats.linregress(x=x, y=y)
res_y_x = scipy.stats.linregress(x=y, y=x)
print(res_x_y,res_y_x,sep='\n')
#Fig. 1
sns.scatterplot(
x=x,
y=y
)
fx_1= np.array([x.min(), x.max()])
fy_1= res_x_y.intercept + res_x_y.slope * fx_1
plt.plot(fx_1, fy_1);
LinregressResult(slope=-0.08232675339862278, intercept=20.78664866843383, rvalue=-0.2286256359130291, pvalue=2.5282897209443147e-05, stderr=0.01926834673577886, intercept_stderr=0.8541730787409803) LinregressResult(slope=-0.6349051704195023, intercept=54.89085424504756, rvalue=-0.2286256359130291, pvalue=2.5282897209443147e-05, stderr=0.14859778216623312, intercept_stderr=2.567341513538256)
# Fig. 2: axes swapped (y on the horizontal axis, x on the vertical axis),
# with the line from the regression fitted in that direction (res_y_x).
sns.scatterplot(x=y, y=x)
fx_2 = np.array([y.min(), y.max()])
fy_2 = res_y_x.slope * fx_2 + res_y_x.intercept
plt.plot(fx_2, fy_2)
[<matplotlib.lines.Line2D at 0x7fbe08b67940>]
# Fig. 3: both fitted lines over the same scatter (x horizontal, y vertical).
sns.scatterplot(x=x, y=y)
# Line from the first fit, as computed for Fig. 1.
plt.plot(fx_1, fy_1)
# Line from the second fit with its coordinates swapped so that it is
# drawn in the same axes as the scatter.
plt.plot(fy_2, fx_2)
[<matplotlib.lines.Line2D at 0x7fbe08ae0580>]
# statsmodels formula API: OLS of bill length on bill depth; only the
# fitted parameters are displayed.
smf.ols(
    data=processed_penguins_df,
    formula='bill_length_mm ~ bill_depth_mm ',
).fit().params
Intercept 54.890854 bill_depth_mm -0.634905 dtype: float64
# OLS in the opposite direction: bill depth explained by bill length.
smf.ols(
    data=processed_penguins_df,
    formula='bill_depth_mm ~ bill_length_mm',
).fit().params
Intercept 20.786649 bill_length_mm -0.082327 dtype: float64
# Model 1: body mass explained by bill length alone.
model_1 = smf.ols(
    data=processed_penguins_df,
    formula='body_mass_g ~ bill_length_mm',
).fit()
model_1.summary()  # description of the fitted model
| Dep. Variable: | body_mass_g | R-squared: | 0.347 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.345 |
| Method: | Least Squares | F-statistic: | 176.2 |
| Date: | Fri, 30 Jun 2023 | Prob (F-statistic): | 1.54e-32 |
| Time: | 21:14:47 | Log-Likelihood: | -2629.1 |
| No. Observations: | 333 | AIC: | 5262. |
| Df Residuals: | 331 | BIC: | 5270. |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 388.8452 | 289.817 | 1.342 | 0.181 | -181.271 | 958.961 |
| bill_length_mm | 86.7918 | 6.538 | 13.276 | 0.000 | 73.931 | 99.652 |
| Omnibus: | 6.141 | Durbin-Watson: | 0.849 |
|---|---|---|---|
| Prob(Omnibus): | 0.046 | Jarque-Bera (JB): | 4.899 |
| Skew: | -0.197 | Prob(JB): | 0.0864 |
| Kurtosis: | 2.555 | Cond. No. | 360. |
# Model 2: adds bill depth as a second predictor of body mass.
model_2 = smf.ols(
    data=processed_penguins_df,
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm',
).fit()
model_2.summary()
| Dep. Variable: | body_mass_g | R-squared: | 0.467 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.464 |
| Method: | Least Squares | F-statistic: | 144.8 |
| Date: | Fri, 30 Jun 2023 | Prob (F-statistic): | 7.04e-46 |
| Time: | 21:14:47 | Log-Likelihood: | -2595.2 |
| No. Observations: | 333 | AIC: | 5196. |
| Df Residuals: | 330 | BIC: | 5208. |
| Df Model: | 2 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 3413.4519 | 437.911 | 7.795 | 0.000 | 2552.002 | 4274.902 |
| bill_length_mm | 74.8126 | 6.076 | 12.313 | 0.000 | 62.860 | 86.765 |
| bill_depth_mm | -145.5072 | 16.873 | -8.624 | 0.000 | -178.699 | -112.315 |
| Omnibus: | 2.839 | Durbin-Watson: | 1.798 |
|---|---|---|---|
| Prob(Omnibus): | 0.242 | Jarque-Bera (JB): | 2.175 |
| Skew: | -0.000 | Prob(JB): | 0.337 |
| Kurtosis: | 2.604 | Cond. No. | 644. |
# Model 3: bill length, bill depth and flipper length as predictors.
model_3 = smf.ols(
    data=processed_penguins_df,
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm',
).fit()
model_3.summary()
| Dep. Variable: | body_mass_g | R-squared: | 0.764 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.762 |
| Method: | Least Squares | F-statistic: | 354.9 |
| Date: | Fri, 30 Jun 2023 | Prob (F-statistic): | 9.26e-103 |
| Time: | 21:14:47 | Log-Likelihood: | -2459.8 |
| No. Observations: | 333 | AIC: | 4928. |
| Df Residuals: | 329 | BIC: | 4943. |
| Df Model: | 3 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -6445.4760 | 566.130 | -11.385 | 0.000 | -7559.167 | -5331.785 |
| bill_length_mm | 3.2929 | 5.366 | 0.614 | 0.540 | -7.263 | 13.849 |
| bill_depth_mm | 17.8364 | 13.826 | 1.290 | 0.198 | -9.362 | 45.035 |
| flipper_length_mm | 50.7621 | 2.497 | 20.327 | 0.000 | 45.850 | 55.675 |
| Omnibus: | 5.596 | Durbin-Watson: | 1.982 |
|---|---|---|---|
| Prob(Omnibus): | 0.061 | Jarque-Bera (JB): | 5.469 |
| Skew: | 0.312 | Prob(JB): | 0.0649 |
| Kurtosis: | 3.068 | Cond. No. | 5.44e+03 |
# Model 4: the three numeric predictors plus sex as a categorical term.
model_4 = smf.ols(
    data=processed_penguins_df,
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + C(sex)',
).fit()
model_4.summary()
| Dep. Variable: | body_mass_g | R-squared: | 0.823 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.821 |
| Method: | Least Squares | F-statistic: | 381.3 |
| Date: | Fri, 30 Jun 2023 | Prob (F-statistic): | 6.28e-122 |
| Time: | 21:14:48 | Log-Likelihood: | -2411.8 |
| No. Observations: | 333 | AIC: | 4834. |
| Df Residuals: | 328 | BIC: | 4853. |
| Df Model: | 4 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -2288.4650 | 631.580 | -3.623 | 0.000 | -3530.924 | -1046.006 |
| C(sex)[T.male] | 541.0285 | 51.710 | 10.463 | 0.000 | 439.304 | 642.753 |
| bill_length_mm | -2.3287 | 4.684 | -0.497 | 0.619 | -11.544 | 6.886 |
| bill_depth_mm | -86.0882 | 15.570 | -5.529 | 0.000 | -116.718 | -55.459 |
| flipper_length_mm | 38.8258 | 2.448 | 15.862 | 0.000 | 34.011 | 43.641 |
| Omnibus: | 2.598 | Durbin-Watson: | 1.843 |
|---|---|---|---|
| Prob(Omnibus): | 0.273 | Jarque-Bera (JB): | 2.125 |
| Skew: | 0.062 | Prob(JB): | 0.346 |
| Kurtosis: | 2.629 | Cond. No. | 7.01e+03 |
# Model 5: how much a penguin weighs based on its flipper length, with sex
# as a categorical term.
model_5 = smf.ols(
    data=processed_penguins_df,
    formula='body_mass_g ~ flipper_length_mm + C(sex)',
).fit()
model_5.summary()
| Dep. Variable: | body_mass_g | R-squared: | 0.806 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.805 |
| Method: | Least Squares | F-statistic: | 684.8 |
| Date: | Fri, 30 Jun 2023 | Prob (F-statistic): | 3.53e-118 |
| Time: | 21:14:48 | Log-Likelihood: | -2427.2 |
| No. Observations: | 333 | AIC: | 4860. |
| Df Residuals: | 330 | BIC: | 4872. |
| Df Model: | 2 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -5410.3002 | 285.798 | -18.931 | 0.000 | -5972.515 | -4848.085 |
| C(sex)[T.male] | 347.8503 | 40.342 | 8.623 | 0.000 | 268.491 | 427.209 |
| flipper_length_mm | 46.9822 | 1.441 | 32.598 | 0.000 | 44.147 | 49.817 |
| Omnibus: | 0.262 | Durbin-Watson: | 1.710 |
|---|---|---|---|
| Prob(Omnibus): | 0.877 | Jarque-Bera (JB): | 0.376 |
| Skew: | 0.051 | Prob(JB): | 0.829 |
| Kurtosis: | 2.870 | Cond. No. | 2.95e+03 |
# Observed body mass next to the in-sample predictions of each of the five
# models, plus species and sex for later grouping.
models_result = pd.DataFrame({
    'actual_value': processed_penguins_df.body_mass_g,
    'prediction_model_1': model_1.predict(),
    'prediction_model_2': model_2.predict(),
    'prediction_model_3': model_3.predict(),
    'prediction_model_4': model_4.predict(),
    'prediction_model_5': model_5.predict(),
    'species': processed_penguins_df.species,
    'sex': processed_penguins_df.sex,
})
models_result
| actual_value | prediction_model_1 | prediction_model_2 | prediction_model_3 | prediction_model_4 | prediction_model_5 | species | sex | |
|---|---|---|---|---|---|---|---|---|
| 0 | 3750.0 | 3782.402961 | 3617.641192 | 3204.761227 | 3579.136946 | 3441.323750 | Adelie | male |
| 1 | 3800.0 | 3817.119665 | 3836.725580 | 3436.701722 | 3343.220772 | 3328.384372 | Adelie | female |
| 2 | 3250.0 | 3886.553073 | 3809.271371 | 3906.897032 | 3639.137335 | 3751.223949 | Adelie | female |
| 4 | 3450.0 | 3574.102738 | 3350.786581 | 3816.705772 | 3457.954243 | 3657.259599 | Adelie | female |
| 5 | 3650.0 | 3799.761313 | 3356.140070 | 3696.168128 | 3764.536023 | 3864.163327 | Adelie | male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | 4000.0 | 5231.825347 | 4706.954140 | 4599.187485 | 4455.022405 | 4662.860306 | Chinstrap | male |
| 340 | 3400.0 | 4164.286703 | 4034.121055 | 4274.552753 | 3894.857519 | 4080.099176 | Chinstrap | female |
| 341 | 3775.0 | 4693.716437 | 4475.927353 | 3839.563668 | 4063.639819 | 4005.109853 | Chinstrap | male |
| 342 | 4100.0 | 4797.866549 | 4449.296758 | 4720.740455 | 4652.013882 | 4803.806832 | Chinstrap | male |
| 343 | 3775.0 | 4745.791493 | 4448.061337 | 4104.268240 | 3672.299099 | 3892.170475 | Chinstrap | female |
333 rows × 8 columns
# Empirical CDFs of the observed body mass and of model 5's predictions.
sns.ecdfplot(data=models_result[['actual_value', 'prediction_model_5']])
<AxesSubplot: ylabel='Proportion'>
# Compute the densities; cumulative=True produces smoothed cumulative curves.
sns.kdeplot(data=models_result, cumulative=True)
<AxesSubplot: ylabel='Density'>
# Body mass vs. flipper length with a separate regression line per sex.
sns.lmplot(
    x='flipper_length_mm',
    y='body_mass_g',
    hue='sex',
    data=processed_penguins_df,
    height=10,
)
<seaborn.axisgrid.FacetGrid at 0x7fbe089bf6a0>
# Logistic model of sex (numeric_sex) against flipper length, bill length,
# bill depth and the island as a categorical term.
(
    smf.logit(
        data=processed_penguins_df,
        formula='numeric_sex ~ flipper_length_mm + bill_length_mm + bill_depth_mm + C(island)',
    )
    .fit()
    .summary()
)
Optimization terminated successfully.
Current function value: 0.360900
Iterations 7
| Dep. Variable: | numeric_sex | No. Observations: | 333 |
|---|---|---|---|
| Model: | Logit | Df Residuals: | 327 |
| Method: | MLE | Df Model: | 5 |
| Date: | Fri, 30 Jun 2023 | Pseudo R-squ.: | 0.4793 |
| Time: | 21:21:23 | Log-Likelihood: | -120.18 |
| converged: | True | LL-Null: | -230.80 |
| Covariance Type: | nonrobust | LLR p-value: | 8.021e-46 |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -61.4464 | 6.944 | -8.849 | 0.000 | -75.057 | -47.836 |
| C(island)[T.Dream] | -1.5596 | 0.493 | -3.163 | 0.002 | -2.526 | -0.593 |
| C(island)[T.Torgersen] | -1.0323 | 0.599 | -1.725 | 0.085 | -2.205 | 0.141 |
| flipper_length_mm | 0.1393 | 0.024 | 5.874 | 0.000 | 0.093 | 0.186 |
| bill_length_mm | 0.1413 | 0.045 | 3.150 | 0.002 | 0.053 | 0.229 |
| bill_depth_mm | 1.6401 | 0.185 | 8.864 | 0.000 | 1.277 | 2.003 |
# Difference between the fitted island coefficients of the logit model:
# Dream (about -1.56) minus Torgersen (about -1.03), typed in by hand.
# NOTE(review): these are logit coefficients (log-odds scale), not
# probabilities - confirm the intended interpretation.
-1.55 - (-1.03)
-0.52
# Count table for the categorical variables island and sex.
processed_penguins_df.value_counts(subset=['island', 'sex']).reset_index(name='count')
| island | sex | count | |
|---|---|---|---|
| 0 | Biscoe | male | 83 |
| 1 | Biscoe | female | 80 |
| 2 | Dream | male | 62 |
| 3 | Dream | female | 61 |
| 4 | Torgersen | female | 24 |
| 5 | Torgersen | male | 23 |
# List the distinct species labels present in the dataset.
processed_penguins_df.species.unique()
array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object)
# Binary target for the logistic model: 1 when the penguin is an Adelie,
# 0 for any other species. An explicit comparison guarantees an integer
# 0/1 column; a list-to-list replace would leave any unlisted label in
# place as a string and relies on implicit dtype inference.
processed_penguins_df = (
    processed_penguins_df
    .assign(is_adelie=lambda df: (df.species == 'Adelie').astype(int))
)
# Model to determine whether a penguin is an Adelie from its sex and
# flipper length.
model_is_adele = (
    smf.logit(
        data=processed_penguins_df,
        formula='is_adelie ~ flipper_length_mm + C(sex)',
    )
    .fit()
)
model_is_adele.params  # show only the fitted parameters of the model
Optimization terminated successfully.
Current function value: 0.355225
Iterations 8
Intercept 40.568368 C(sex)[T.male] 1.282656 flipper_length_mm -0.209705 dtype: float64
# Actual label next to the model's prediction; rounding the predicted
# probability turns it into a 0/1 class.
is_adelie_df_prediction = pd.DataFrame({
    'actual_adelie': processed_penguins_df.is_adelie,
    'predicted_values': model_is_adele.predict().round(),
})
is_adelie_df_prediction
| actual_adelie | predicted_values | |
|---|---|---|
| 0 | 1 | 1.0 |
| 1 | 1 | 1.0 |
| 2 | 1 | 0.0 |
| 4 | 1 | 1.0 |
| 5 | 1 | 1.0 |
| ... | ... | ... |
| 339 | 0 | 0.0 |
| 340 | 0 | 0.0 |
| 341 | 0 | 1.0 |
| 342 | 0 | 0.0 |
| 343 | 0 | 0.0 |
333 rows × 2 columns
# Count every (actual, predicted) combination.
is_adelie_df_prediction.value_counts(
    subset=['actual_adelie', 'predicted_values']
).reset_index(name='count')
| actual_adelie | predicted_values | count | |
|---|---|---|---|
| 0 | 0 | 0.0 | 151 |
| 1 | 1 | 1.0 | 129 |
| 2 | 0 | 1.0 | 36 |
| 3 | 1 | 0.0 | 17 |
# Confusion matrix of actual labels (first argument) against predicted
# labels (second argument).
print(
    sklearn.metrics.confusion_matrix(
        is_adelie_df_prediction.actual_adelie,
        is_adelie_df_prediction.predicted_values
    )
)
# Prediction accuracy of the model. Kept as the last expression of the cell
# so the displayed number is always the computed one, never a pasted literal.
sklearn.metrics.accuracy_score(
    is_adelie_df_prediction.actual_adelie,
    is_adelie_df_prediction.predicted_values
)
[[151 36] [ 17 129]]
0.8408408408408409
# Bill depth vs. bill length with one regression line for all penguins.
sns.lmplot(x='bill_length_mm', y='bill_depth_mm', data=processed_penguins_df)
<seaborn.axisgrid.FacetGrid at 0x7fbe08be7d30>
# Same plot with one regression line per species, coloured with the
# penguin_color palette.
sns.lmplot(
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    palette=penguin_color,
    data=processed_penguins_df,
)
<seaborn.axisgrid.FacetGrid at 0x7fbe0305cc70>
# Visual exploration of the data with seaborn: pairwise plots coloured by
# species.
sns.pairplot(hue='species', palette=penguin_color, data=processed_penguins_df)
<seaborn.axisgrid.PairGrid at 0x7fbe030da3e0>
# Print the versions of the packages used in this session.
session_info.show()
----- empiricaldist NA janitor 0.23.1 matplotlib 3.6.0 numpy 1.23.4 palmerpenguins 0.1.4 pandas 1.5.1 scipy 1.10.1 seaborn 0.12.1 session_info 1.0.0 sklearn 1.2.2 statsmodels 0.13.5 -----
PIL 9.5.0 anyio NA arrow 1.2.3 asttokens NA attr 23.1.0 babel 2.12.1 backcall 0.2.0 certifi 2023.05.07 cffi 1.15.1 charset_normalizer 3.1.0 comm 0.1.3 contourpy 1.1.0 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 debugpy 1.6.7 decorator 5.1.1 defusedxml 0.7.1 executing 1.2.0 fastjsonschema NA fqdn NA idna 3.4 ipykernel 6.23.3 isoduration NA jedi 0.18.2 jinja2 3.1.2 joblib 1.2.0 json5 NA jsonpointer 2.4 jsonschema 4.17.3 jupyter_events 0.6.3 jupyter_server 2.6.0 jupyterlab_server 2.23.0 kiwisolver 1.4.4 lazy_loader NA markupsafe 2.1.3 matplotlib_inline 0.1.6 mpl_toolkits NA multipledispatch 0.6.0 natsort 8.4.0 nbformat 5.9.0 overrides NA packaging 23.1 pandas_flavor NA parso 0.8.3 patsy 0.5.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 3.8.0 prometheus_client NA prompt_toolkit 3.0.38 psutil 5.9.5 ptyprocess 0.7.0 pure_eval 0.2.2 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.15.1 pyparsing 3.1.0 pyrsistent NA pythonjsonlogger NA pytz 2023.3 requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 send2trash NA sitecustomize NA six 1.16.0 sniffio 1.3.0 stack_data 0.6.2 threadpoolctl 3.1.0 tornado 6.3.2 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 2.0.3 wcwidth 0.2.6 webcolors 1.13 websocket 1.6.1 xarray 2023.6.0 yaml 6.0 zmq 25.1.0
----- IPython 8.14.0 jupyter_client 8.3.0 jupyter_core 5.3.1 jupyterlab 4.0.2 ----- Python 3.10.6 (main, May 29 2023, 11:10:38) [GCC 11.3.0] Linux-5.15.0-76-generic-x86_64-with-glibc2.35 ----- Session information updated at 2023-06-30 21:14